In [1]:
from bs4 import BeautifulSoup
import pandas as pd
pd.options.display.max_columns = 100
In [2]:
base_path = "/data/kaggle/evergreen/"
def get_path(filename):
    return base_path + filename
In [3]:
df_train = pd.read_csv(get_path("train.tsv"), sep="\t")
df_train.head()
Out[3]:
In [4]:
df_test = pd.read_csv(get_path("test.tsv"), sep="\t")
df_test.head()
Out[4]:
In [5]:
df_train.boilerplate[0]
Out[5]:
In [6]:
df_train.label.value_counts()/len(df_train)
Out[6]:
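In [6] computes the class proportions; the majority-class share is the accuracy a constant predictor would reach, so it is the baseline to beat. The same proportions can be computed directly:

In [ ]:
# Equivalent to value_counts()/len(df_train)
df_train.label.value_counts(normalize=True)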
In [8]:
import json, re
In [9]:
def preprocess(boilerplate):
    d = json.loads(boilerplate)
    body = d["body"]
    if body is not None:
        # Remove HTML tags
        text = BeautifulSoup(body.lower(), "html5lib").text
        # Collapse runs of non-word characters (whitespace and
        # punctuation) into a single space (" ")
        text = re.sub(r"[\W]+", " ", text)
        return text
    return ""
preprocess(df_train.boilerplate[0])
Out[9]:
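The boilerplate JSON typically also carries "title" and "url" fields alongside "body"; a hedged sketch of a variant that folds the title into the cleaned text (assuming the "title" key is present, possibly null, in each record):

In [ ]:
def preprocess_with_title(boilerplate):
    # Sketch: combine the "title" and "body" fields before cleaning,
    # falling back to "" when either is null.
    d = json.loads(boilerplate)
    raw = " ".join([(d.get("title") or ""), (d.get("body") or "")])
    text = BeautifulSoup(raw.lower(), "html5lib").text
    return re.sub(r"[\W]+", " ", text)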
In [10]:
%%time
df_train["body"]= df_train.boilerplate.apply(preprocess)
In [11]:
%%time
df_test["body"]= df_test.boilerplate.apply(preprocess)
In [12]:
import nltk
In [13]:
def my_tokenizer(s):
    porter = nltk.stem.porter.PorterStemmer()
    # Stem every token, then keep only terms longer than two characters
    terms = [porter.stem(w) for w in nltk.word_tokenize(s)]
    terms = [term for term in terms if len(term) > 2]
    return terms
my_tokenizer("In Yellowstone National Park, warming has brought rapid changes.")
Out[13]:
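One performance note: my_tokenizer constructs a new PorterStemmer on every call, and the vectorizer below calls it once per document. A sketch of the same tokenizer with the stemmer hoisted out (identical output, just less overhead):

In [ ]:
porter = nltk.stem.porter.PorterStemmer()  # built once, shared across calls
def my_tokenizer_fast(s):
    # Same behaviour as my_tokenizer: stem, then keep terms longer than 2 chars
    return [t for t in (porter.stem(w) for w in nltk.word_tokenize(s)) if len(t) > 2]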
In [14]:
from sklearn import feature_extraction
In [15]:
stopwords = nltk.corpus.stopwords.words("english")
tfidf = feature_extraction.text.TfidfVectorizer(
    tokenizer=my_tokenizer,
    stop_words=stopwords,
    ngram_range=(1, 1))
In [16]:
%%time
body_train_tfidf = tfidf.fit_transform(df_train.body)
In [17]:
%%time
body_test_tfidf = tfidf.transform(df_test.body)
In [18]:
body_train_tfidf.shape, body_test_tfidf.shape
Out[18]:
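To sanity-check the fitted vocabulary, one can list the highest-weighted terms of a single document; a minimal sketch (assuming a scikit-learn version that provides get_feature_names_out, i.e. 1.0 or later):

In [ ]:
import numpy as np
terms = tfidf.get_feature_names_out()
weights = body_train_tfidf[0].toarray().ravel()
top = weights.argsort()[::-1][:10]   # indices of the ten largest tf-idf weights
list(zip(terms[top], weights[top]))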
In [19]:
type(body_train_tfidf)
Out[19]:
In [20]:
df_train.columns
Out[20]:
In [27]:
columns = ['avglinksize', 'commonlinkratio_1',
'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
'lengthyLinkDomain', 'linkwordscore',
'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
'parametrizedLinkRatio', 'spelling_errors_ratio']
# Excluded: 'news_front_page', 'alchemy_category_score'
X = pd.concat([df_train, df_test])[columns]
X = pd.get_dummies(X, drop_first=True)
print(X.shape)
X_train = X.iloc[:len(df_train), :]
X_test = X.iloc[len(df_train):, :]
X_train.shape, X_test.shape
Out[27]:
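get_dummies only expands non-numeric columns, so it is worth confirming which of the selected columns are objects in the raw frames; this dataset reportedly marks missing values with the string '?', which is what turns otherwise numeric columns into objects. A quick check (the '?' convention is an assumption about the raw TSVs):

In [ ]:
raw = pd.concat([df_train, df_test])[columns]
print(raw.select_dtypes(include="object").columns.tolist())  # what get_dummies expands
print((raw == "?").sum().loc[lambda s: s > 0])               # '?' counts per column, if any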
In [28]:
from sklearn import preprocessing
In [29]:
import scipy as sp
import numpy as np
In [30]:
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
In [31]:
X_train = sp.sparse.hstack((X_train, body_train_tfidf))
X_test = sp.sparse.hstack((X_test, body_test_tfidf))
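sp.sparse.hstack returns a COO matrix by default, which does not support row indexing; converting to CSR once avoids repeated format conversions in the splitting and fitting steps below (optional, since scikit-learn converts internally anyway):

In [ ]:
X_train = X_train.tocsr()
X_test = X_test.tocsr()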
In [32]:
X_train.shape, X_test.shape
Out[32]:
In [34]:
label_encoder = preprocessing.LabelEncoder()
y_train = label_encoder.fit_transform(df_train.label)
# The test set has no label column, so there is nothing to transform:
#y_test = label_encoder.transform(df_test.label)
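Since label is already a 0/1 integer column, the encoder should act as an identity mapping; a quick check:

In [ ]:
print(label_encoder.classes_)                          # expected: [0 1]
print(np.array_equal(y_train, df_train.label.values))  # True if it was an identity map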
In [36]:
from sklearn import linear_model, metrics, model_selection
In [69]:
%%time
X1, X2, y1, y2 = model_selection.train_test_split(
    X_train, y_train, test_size=0.3, random_state=1)
lr = linear_model.LogisticRegression(C=1.0, random_state=1, max_iter=10000,
                                     n_jobs=12, solver="saga")
lr.fit(X1, y1)
y2_pred = lr.predict(X2)
print("Accuracy: ", metrics.accuracy_score(y2, y2_pred))
In [68]:
%%time
lr = linear_model.LogisticRegression(random_state=1, max_iter=5000,
                                     n_jobs=12, solver="saga")
lr.fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
In [48]:
submission = pd.DataFrame({"urlid": df_test.urlid, "label": y_test_pred})
submission.sample(10)
Out[48]:
In [50]:
submission.to_csv("/tmp/submission.csv", index=False)
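For an AUC-scored leaderboard, submitting the positive-class probability instead of a hard 0/1 label usually scores better; a variant submission (assuming the grader accepts real-valued labels, which AUC scoring implies):

In [ ]:
submission_proba = pd.DataFrame({
    "urlid": df_test.urlid,
    "label": lr.predict_proba(X_test)[:, 1],  # P(evergreen) rather than a hard label
})
submission_proba.to_csv("/tmp/submission_proba.csv", index=False)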